Linear model to fit classifications
Why are the reporters classified as repressive/activating? Fit linear model to reporter classification
# Sub-select relevant data
cDNA_df_lm <- cDNA_df %>%
filter(neg_ctrls == "No", commercial_reporter == "No", promoter != "Random",
condition %in% c("HepG2_CDCA", "HepG2_DMSO", "K562_DMSO", "mES_2i",
"mES_2i_LIF", "mES_HQ", "mES_LIF_CH", "mES_LIF_PD",
"mES_N2B27"
),
tf %in% names(cDNA_df4)) %>%
mutate(background = as.factor(background)) %>%
dplyr::select(reporter_id, tf, reporter_activity_class, condition, promoter, spacing, background, distance) %>%
unique()
cDNA_df3[is.na(cDNA_df3)] <- 0
# Only select tfs for which differences in reporter activity were observed (for the other ones it doesn't make sense to fit models)
mixed_classes <- cDNA_df3 %>%
mutate(id = paste(condition, tf, sep = "_")) %>%
dplyr::select(-class) %>%
#filter(class != 0) %>%
unique() %>%
filter(`1` < 0.95, `0` < 0.95, `2` < 0.95)
# Select only relevant tfs from source data frame and only include ids with more than two entries
cDNA_df_lm$id <- paste(cDNA_df_lm$condition, cDNA_df_lm$tf, sep = "_")
cDNA_df_lm <- cDNA_df_lm %>%
mutate(sum = ave(id, id, FUN = function(x) length(x))) %>%
filter(sum >= 2, id %in% mixed_classes$id) %>%
dplyr::select(-sum)
# Fit linear model for each id (combination of condition and tf)
lm_features <- lapply(unique(cDNA_df_lm$id), function(x) lm(reporter_activity_class ~ promoter + spacing + background + distance,
cDNA_df_lm[cDNA_df_lm$id == x,]))
names(lm_features) <- unique(cDNA_df_lm$id)
## Extract and plot weights from linear model
lm_weights <- list()
for (i in names(lm_features)) {
lm_weights[[i]] <- lm_features[[i]]$coefficients
}
lm_weights <- bind_rows(lm_weights, .id = "id")
lm_weights <- lm_weights %>%
dplyr::select(-`(Intercept)`) %>%
pivot_longer(-id, names_to = "feature", values_to = "weight")
ggplot_custom(lm_weights, aes(x = feature, y = weight)) +
geom_quasirandom() +
theme_classic_lines_90() +
ggtitle('weights of all models')
## Warning: Removed 279 rows containing missing values (position_quasirandom).

## Extract and plot features from linear model
exp_features <- list()
for (i in names(lm_features)) {
exp_features[[i]] <- anova(lm_features[[i]]) %>%
dplyr::select('sum_sq' = `Sum Sq`) %>%
na.omit() %>%
mutate(sum = sum(sum_sq)) %>%
mutate(rel_sum_sq = (sum_sq/sum)*100) %>%
setnames("rel_sum_sq", "percent") %>%
dplyr::select("percent")
}
exp_features <- bind_rows(exp_features, .id = "id") %>%
rownames_to_column("feature") %>%
mutate(feature = gsub("...[0-9]{1-3}", "", feature)) %>%
mutate(feature = gsub("[0-9]", "", feature)) %>%
mutate(percent = format(percent, scientific = F, digits = 0)) %>%
mutate(percent = as.numeric(percent))
exp_features$group <- "no residual"
exp_features$group[grep("Residual", exp_features$feature)] <- "residual"
exp_features$percent[exp_features$feature == "Residuals"] <- 100 - exp_features$percent[exp_features$feature == "Residuals"]
exp_features$feature <- revalue(exp_features$feature, c("Residuals"="total"))
ggplot_custom(exp_features,
aes(x = percent, y = reorder(feature, -percent), fill = group)) +
geom_bar(stat = "identity") +
xlab("Variance explained (%)") +
ylab("") +
ggtitle('sum of features of all models')

## Plot features separately per tf activity class
classes <- cDNA_df3 %>%
mutate(id = paste(condition, tf, sep = "_")) %>%
dplyr::select(id, class) %>%
unique()
exp_features <- merge(exp_features, classes, by = "id")
ggplot_custom(exp_features,
aes(y = percent, x = feature, color = group)) +
geom_quasirandom() +
xlab("Variance explained (%)") +
ylab("") +
facet_wrap(~class) +
theme_classic_lines_90() +
ggtitle('features of all models per tf activity class')

## Discard poor performing linear models (<50% of variance explained)
exp_features_good <- exp_features %>%
filter(group == "residual", percent >= 50)
exp_features_sel <- exp_features[exp_features$id %in% exp_features_good$id,]
exp_features_sel <- exp_features_sel %>%
mutate(tf = gsub(".*_(.*)", "\\1", id),
condition = gsub("(.*)_.*", "\\1", id))
## Plot again the features
ggplot_custom(exp_features_sel,
aes(y = percent, x = feature, color = group)) +
geom_quasirandom() +
xlab("Variance explained (%)") +
ylab("") +
ggtitle('features of all models - only good models')

## Plot features individually per id
### Order the ids
order <- exp_features_sel %>%
mutate(sum = ave(percent, id, FUN = function(x) sum(x))) %>%
dplyr::select(sum, tf) %>%
unique() %>%
group_by(tf) %>%
top_n(1, sum)
exp_features_sel_ord <- merge(order, exp_features_sel, by = "tf") %>%
filter(feature != "total")
### As bar plot
ggplot(exp_features_sel_ord %>%
filter(condition %in% c("K562_DMSO", "HepG2_DMSO", "mES_2i_LIF")),
aes(y = percent, x = reorder(id, -sum), fill = feature, group = feature)) +
geom_bar(stat = "identity", position = "stack") +
xlab("Variance explained (%)") +
ylab("") +
scale_fill_manual(values = c("#e07a5f", "#3d405b", "#81b29a", "#f2cc8f"))+
theme_classic_lines_90() +
ggtitle('feature importance per tf & condition')

### As connected line-plot
ggplot_custom(exp_features_sel, aes(x=feature, y=percent, color=group, group = id)) +
geom_line() +
geom_point()+
ggtitle('feature importance per tf & condition')

### Only select the three main cell lines to simplify
exp_features_sel_heatmap <- exp_features_sel %>%
filter(condition %in% c("mES_2i_LIF", "K562_DMSO", "HepG2_DMSO")) %>%
mutate(cell_type = gsub("(.*)_.*", "\\1", condition)) %>%
mutate(cell_type = gsub("(.*)_.*", "\\1", cell_type)) %>%
dplyr::select(-group, -condition) %>%
filter(feature != "total") %>%
spread(feature, percent) %>%
column_to_rownames("id") #%>%
#arrange(tf)
cell_type <- exp_features_sel_heatmap %>%
dplyr::select(cell_type)
colors_pheatmap <- list('cell_type' = c("K562" = colors_diverse[1], "HepG2" = colors_diverse[5], "mES" = colors_diverse[4]))
pheatmap(as.matrix(t(exp_features_sel_heatmap %>% dplyr::select(-cell_type, -tf, -class))),
color = colorRampPalette(brewer.pal(n = 7, name = "Greys"))(100),
border_color = "white",
cellwidth = 8, cellheight = 8,
angle_col = 90, annotation_col = cell_type, cluster_cols = T,
annotation_colors = colors_pheatmap,
main = 'feature importance per tf & condition')

Comparison of TF reporters between conditions
Can we select the best reporter for each TF?
# Select conditions for which I have robust data, remove highly active reporters that I took over from library 1 and included in library 2 (because those distort the activity distributions)
cDNA_df2 <- cDNA_df %>%
filter(condition %in% c("HepG2_DMSO", "K562_DMSO", "mES_2i_LIF", "mES_2i", "mES_N2B27", "mES_LIF_PD", "mES_LIF_CH", "mES_HQ"), library != "1+2")
tfs_lib1 <- cDNA_df %>%
filter(library == "1") %>%
dplyr::select(tf) %>%
unique()
cDNA_df3 <- cDNA_df %>%
filter(condition %in% c("HepG2_DMSO", "K562_DMSO", "mES_2i_LIF", "mES_2i", "mES_N2B27", "mES_LIF_PD", "mES_LIF_CH", "mES_HQ"), library == "1+2")
cDNA_df3 <- cDNA_df3[!cDNA_df3$tf %in% tfs_lib1$tf,]
cDNA_df<- rbind(cDNA_df2, cDNA_df3)
## Compare the activities of TF reporters in different conditions
### Select TFs for which I have good models
reporter_correlations <- cDNA_df %>%
mutate(id = paste(condition, tf, sep = "_")) %>%
filter(id %in% cDNA_df_lm$id) %>%
dplyr::select(reporter_id, condition, tf, reporter_activity_minP) %>%
unique() %>%
spread(condition, reporter_activity_minP)
ggplot_custom(reporter_correlations %>%
dplyr::select(-mES_2i_LIF) %>%
na.omit(), aes(x = log2(K562_DMSO), y = log2(HepG2_DMSO))) +
geom_point() +
facet_wrap(~tf) +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
ggtitle('comparison of reporter activities between conditions')
## `geom_smooth()` using formula 'y ~ x'

### Or for a single case: STAT3 upon JAK activation
reporter_correlations <- cDNA_df %>%
dplyr::select(reporter_id, condition, tf, reporter_activity_minP) %>%
unique() %>%
spread(condition, reporter_activity_minP)
ggplot_custom(reporter_correlations %>%
filter(tf == "STAT3") %>%
dplyr::select(mES_2i_LIF, mES_2i, tf, reporter_id) %>%
na.omit(), aes(x = log2(mES_2i_LIF), y = log2(mES_2i))) +
geom_point() +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
ggtitle('STAT3 reporters upon JAK activation')
## `geom_smooth()` using formula 'y ~ x'

# Differences in sensitivity and specificity
ggplot_custom(reporter_correlations %>%
filter(tf == "TCF3") %>%
dplyr::select(mES_2i_LIF, mES_LIF_PD, tf, reporter_id) %>%
na.omit(), aes(x = log2(mES_2i_LIF), y = log2(mES_LIF_PD))) +
geom_point() +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
ggtitle('TCF3 reporters upon WNT inactivation')
## `geom_smooth()` using formula 'y ~ x'

# Differences in specificity
ggplot_custom(reporter_correlations %>%
filter(tf == "NFE2L2") %>%
dplyr::select(mES_2i_LIF, mES_HQ, tf, reporter_id) %>%
na.omit(), aes(x = log2(mES_2i_LIF), y = log2(mES_HQ))) +
geom_point() +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
ggtitle('NRF2 reporters upon oxidative stress induction')
## `geom_smooth()` using formula 'y ~ x'

# Clear case of design doesn't matter
Compare reporter activities to other measures
Comparison with TF expression and MPRA-inferred TF activities (Ernst et al.)
# Extract TF activities for K562 & HepG2
tf_activities <- cDNA_df %>%
filter(condition %in% c("K562_DMSO", "HepG2_DMSO"), neg_ctrls == "No") %>%
dplyr::select(reporter_id, reporter_activity_minP, condition, tf) %>%
unique() %>%
mutate(tf = gsub("_.*", "", tf)) %>%
spread(condition, reporter_activity_minP) %>%
mutate(dif = log2(K562_DMSO / HepG2_DMSO)) %>%
dplyr::select(tf, dif, reporter_id)
#write_csv2(tf_activities, "/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/tf_reporter_activities_all.csv")
# Correlate with RNA-seq
load('/DATA/usr/m.martinez.ara/GitLab/gurten/gurten/data/fc181121_epsure_tadec_rnaseq_expr_joshi.RData')
# remove Serum data and calculate mean expression
tib_fpkm_joshi$twoi <- (tib_fpkm_joshi$twoi_1 + tib_fpkm_joshi$twoi_2)/2
tib_fpkm_joshi <- tib_fpkm_joshi %>%
dplyr::select(gene_id, 'tf' = symbol, twoi)
tf_activities2 <- cDNA_df %>%
filter(condition == "mES_2i_LIF", neg_ctrls == "No") %>%
dplyr::select(reporter_id, reporter_activity_minP, tf) %>%
unique() %>%
mutate(tf = gsub("_.*", "", tf),
tf_activity = ave(reporter_activity_minP, tf, FUN = function(x) mean(x))) %>%
dplyr::select(tf_activity, tf) %>%
unique()
tf_activities2 <- merge(tf_activities2, tib_fpkm_joshi, by = "tf") %>%
mutate(tf_activity = (tf_activity-mean(tf_activity))/sd(tf_activity),
twoi = (twoi-mean(twoi))/sd(twoi))
ggplot_custom(tf_activities2, aes(x = twoi, y = tf_activity)) +
geom_point() +
ggtitle('TF reporter activities vs. TF expression in mESCs')

# Correlate with regulon data (VIPER, Aerts lab papers)
# Correlate with TF activities from Ernst et al. (https://doi.org/10.1038/nbt.3678)
ernst_activities <- read.table("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/ernst_tf_activities_hepg2_k562.csv", sep = ";", dec = ",", header = T, fill = T)
ernst_activities <- ernst_activities %>%
dplyr::select('tf' = TF, activity_enrichment_HepG2, activity_enrichment_K562) %>%
mutate(avg_score_HepG2 = ave(activity_enrichment_HepG2, tf, FUN = function(x) mean(x)),
avg_score_K562 = ave(activity_enrichment_K562, tf, FUN = function(x) mean(x))) %>%
unique() %>%
pivot_longer(-tf, names_to = "condition", values_to = "ernst_activity") %>%
mutate(condition = gsub("avg_score_", "", condition))
tf_activities <- cDNA_df %>%
filter(condition %in% c("K562_DMSO", "HepG2_DMSO"), neg_ctrls == "No") %>%
dplyr::select(reporter_activity_minP, condition, tf) %>%
mutate(tf = gsub("_.*", "", tf),
condition = gsub("_DMSO", "", condition),
reporter_activity_minP = ave(reporter_activity_minP, condition, tf, FUN = function(x) mean(x))) %>%
unique()
ernst_activities <- merge(ernst_activities, tf_activities, by = c('tf', 'condition')) %>%
mutate(ernst_activity = (ernst_activity-mean(ernst_activity))/sd(ernst_activity),
reporter_activity_minP = (reporter_activity_minP-mean(reporter_activity_minP))/sd(reporter_activity_minP))
ggplot_custom(ernst_activities,
aes(x = ernst_activity, y = reporter_activity_minP)) +
geom_point() +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
facet_wrap(~condition) +
ggtitle('TF reporter activities vs. MPRA-chunk-inferred TF activities')
## `geom_smooth()` using formula 'y ~ x'

#write_csv2(ernst_activities, "/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/tf_reporter_activities_ernst.csv")
Compare reporter activities to ATAC-seq
how does TF reporter activity relate to diffTF inferred TF activity?
# DiffTF was run in basic and classification mode (https://doi.org/10.1016/j.celrep.2019.10.106), final output is imported:
## Input peaks for diffTF were selected in separate script
difftf_activity <- read_tsv("/DATA/usr/m.trauernicht/projects/SuRE-TF/ATAC_seq_hepg2/diffTF/output_rna_top1000_inverse/FINAL_OUTPUT/extension100/HepG2vsK562.top1000.summary.tsv") %>%
dplyr::select(TF, 'diffTF_activity' = weighted_meanDifference, 'classification' = classification_q0.001_final)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## TF = col_character(),
## weighted_meanDifference = col_double(),
## weighted_CD = col_double(),
## TFBS = col_double(),
## weighted_Tstat = col_double(),
## variance = col_logical(),
## pvalue = col_double(),
## Cohend_factor = col_double(),
## pvalueAdj = col_double(),
## median.cor.tfs = col_double(),
## classification_q0.1 = col_character(),
## classification_q0.05 = col_character(),
## classification_q0.01 = col_character(),
## classification_q0.001 = col_character(),
## classification_distr_rawP = col_double(),
## classification_q0.1_final = col_character(),
## classification_q0.05_final = col_character(),
## classification_q0.01_final = col_character(),
## classification_q0.001_final = col_character()
## )
difftf_activity$classification[difftf_activity$TF == "POU5F1::SOX2"] <- difftf_activity$classification[difftf_activity$TF == "POU5F1"]
difftf_activity$classification[difftf_activity$TF == "RAR:RXR"] <- difftf_activity$classification[difftf_activity$TF == "RARA"]
# Import mean reporter activities per TF
reporter_activity_all <- tf_activities
reporter_activity_all_mean <- reporter_activity_all %>%
unique() %>%
pivot_wider(names_from = "condition", values_from = "reporter_activity_minP") %>%
mutate(dif_reporter = log2(K562 / HepG2)) %>%
dplyr::select(dif_reporter, 'TF' = tf) %>%
unique()
reporter_activity_all_mean$TF <- gsub("::.*", "_", reporter_activity_all_mean$TF)
reporter_activity_all_mean$TF[reporter_activity_all_mean$TF == "POU5F1_"] <- "POU5F1::SOX2"
reporter_activity_all_mean$TF[reporter_activity_all_mean$TF == "RARA:RXRA"] <- "RAR:RXR"
reporter_activity_all_mean$TF[reporter_activity_all_mean$TF == "ETS1"] <- "ETS2"
reporter_activity_all_mean$TF[reporter_activity_all_mean$TF == "NFATc1"] <- "NFATC1"
reporter_activity_all_mean$TF[reporter_activity_all_mean$TF == "TCF3"] <- "TCF7L2"
reporter_activity_all_mean$TF <- gsub("_", "", reporter_activity_all_mean$TF)
activity_comp <- merge(difftf_activity, reporter_activity_all_mean, by = "TF", all = T) %>%
na.omit()
activity_comp <- activity_comp %>%
mutate(diffTF_activity_norm = (diffTF_activity-mean(diffTF_activity))/sd(diffTF_activity),
reporter_activity_norm = (dif_reporter-mean(dif_reporter))/sd(dif_reporter))
# Import class of TF
tf_class <- read_csv2("/DATA/usr/m.trauernicht/projects/SuRE-TF/data/library_design/lambert_2018/garcia_tf-properties.csv") %>%
dplyr::select(TF, chromatin_regulation_mode)
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## TF = col_character(),
## mode_of_regulation = col_character(),
## DNA_binding_mode = col_character(),
## DNA_binding_specificity = col_character(),
## chromatin_regulation_mode = col_character(),
## TF_superclass = col_character(),
## TF_class = col_character(),
## tissue_of_expression = col_character()
## )
activity_comp <- merge(activity_comp, tf_class, all = T)
activity_comp$chromatin_regulation_mode[activity_comp$TF == "POU5F1::SOX2"] <- activity_comp$chromatin_regulation_mode[activity_comp$TF == "POU5F1"]
activity_comp$chromatin_regulation_mode[activity_comp$TF == "RAR:RXR"] <- activity_comp$chromatin_regulation_mode[activity_comp$TF == "RARA"]
activity_comp <- activity_comp %>% na.omit()
ggplot(activity_comp, aes(x = reporter_activity_norm, y = diffTF_activity_norm)) +
geom_point(aes( color = chromatin_regulation_mode)) +
geom_abline(linetype = "dashed") +
geom_smooth(method = "lm", color = "grey", fill = "lightgrey") +
xlab("Differential Reporter Activity (K562/HepG2)") +
ylab("Differential Accessibility - diffTF (K562/HepG2)") +
scale_color_manual(values = c("grey",colors_diverse[c(2,5,4)])) +
facet_wrap(~classification) +
ggtitle('TF reporter activity vs. diffTF activity')
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

Plotting mean TF activities per condition
display actual activity values per condition and TF instead of fraction of active reporters per TF
## Prepare dataframe
# Caculate mean TF activity per condition
tf_activity_heatmap <- cDNA_df %>%
filter(neg_ctrls == "No", commercial_reporter == "No") %>%
mutate(tf_activity = ave(reporter_activity_minP, condition, tf, FUN = function(x) mean(x))) %>%
#mutate(tf_activity = log2(tf_activity)) %>%
#mutate(tf_activity = ave(tf_activity, condition, FUN = function(x) (x-mean(x))/sd(x))) %>%
dplyr::select(tf_activity, condition, tf) %>%
unique() %>%
spread(condition, tf_activity) %>%
remove_rownames %>%
column_to_rownames(var="tf")
# Keeping the scale in the pheatmap function
myBreaks1 <- seq(0,6,0.06)
# Keep only TFs that are in at least one condition above a certain threshold
tf_activity_heatmap <- tf_activity_heatmap[rowSums(tf_activity_heatmap > 4) >= 1,] %>%
na.omit()
#tf_activity_heatmap <- tf_activity_heatmap[,c(2,1,6,7,3,8,4,5)]
# pheatmap function
pheatmap(as.matrix(t(tf_activity_heatmap)),
color = colorRampPalette(brewer.pal(n = 7, name = "Greys"))(100),
breaks = myBreaks1, border_color = "black",
cellwidth = 8, cellheight = 8,
angle_col = 90,
main = 'reporter activities per tf and condition')
This heatmap doesn’t take significance into consideration and to me is less representative of tf activity than the fraction of reporter activity classes heatmap